library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(harrietr)
## Registered S3 method overwritten by 'treeio':
## method from
## root.phylo ape
library(here)
## here() starts at /Users/giulieris/OneDrive - The University of Melbourne/R/VANANZ_phenotypes_github
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = here())
print(paste("My working directory is:" ,here()))
## [1] "My working directory is: /Users/giulieris/OneDrive - The University of Melbourne/R/VANANZ_phenotypes_github"
rm(list = ls())
snippy_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_pair_id_snps.mask.tab") %>%
arrange(PAIR_ID)
## Parsed with column specification:
## cols(
## PAIR_ID = col_character(),
## REFERENCE = col_character(),
## ISOLATE = col_character(),
## CHROM = col_character(),
## POS = col_double(),
## TYPE = col_character(),
## REF = col_character(),
## ALT = col_character(),
## EVIDENCE = col_character(),
## FTYPE = col_character(),
## STRAND = col_character(),
## NT_POS = col_character(),
## AA_POS = col_character(),
## EFFECT = col_character(),
## LOCUS_TAG = col_character(),
## GENE = col_character(),
## PRODUCT = col_character()
## )
snippy_data
## # A tibble: 142,030 x 17
## PAIR_ID REFERENCE ISOLATE CHROM POS TYPE REF ALT EVIDENCE FTYPE STRAND
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 GP-0002 BPH2705.… BPH2704 cont… 32669 snp T A A:23 T:0 <NA> <NA>
## 2 GP-0002 BPH2705.… BPH2704 cont… 14 snp G T T:29 G:0 <NA> <NA>
## 3 GP-0002 BPH2705.… BPH2704 cont… 521 snp C A A:23 C:0 <NA> <NA>
## 4 GP-0002 BPH2705.… BPH2704 cont… 389 snp T A A:18 T:0 <NA> <NA>
## 5 GP-0002 BPH2705.… BPH2704 cont… 61262 snp A C C:25 A:0 CDS +
## 6 GP-0002 BPH2705.… BPH2704 cont… 59956 snp C A A:21 C:0 CDS +
## 7 GP-0002 BPH2705.… BPH2704 cont… 38034 snp C A A:41 C:0 CDS +
## 8 GP-0002 BPH2705.… BPH2704 cont… 37962 snp A C C:38 A:0 CDS +
## 9 GP-0002 BPH2705.… BPH2704 cont… 2728 snp A T T:18 A:0 <NA> <NA>
## 10 GP-0002 BPH2705.… BPH2704 cont… 2695 snp C A A:20 C:0 <NA> <NA>
## # … with 142,020 more rows, and 6 more variables: NT_POS <chr>, AA_POS <chr>,
## # EFFECT <chr>, LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>
# modify snippy output:
# 1) generate iso1 and iso2 for merging with the phenotypic analysis
# 2) modify CHROM (contig name) to be consistent with the labelling in bed files down the track
# 3) extract mutation effects for easier interpretation
source("../Functions/aa_convert.R")
snippy_data_modified <- snippy_data %>%
mutate(iso1 = str_remove(REFERENCE, ".gbk"),
iso2 = ISOLATE,
CHROM = str_c(iso1, "_", CHROM)) %>%
separate(EFFECT,
into = c("EFFTYPE", "NUCLEOTIDE_CHANGE", "MUTATION"),
sep = "\\s",
remove = T,
extra = "merge") %>%
mutate(NUCLEOTIDE_CHANGE = str_remove(NUCLEOTIDE_CHANGE, "c."),
MUTATION = str_remove(MUTATION, "p."),
MUTATION_SHORT = aa_convert(MUTATION)) %>%
separate(AA_POS,
into = c("AA_POS", "AA_LENGTH"),
sep = "/",
remove = T) %>%
mutate_at(vars(starts_with("AA")),
as.numeric)
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 488 rows [458,
## 459, 554, 555, 940, 941, 942, 943, 944, 994, 995, 996, 997, 998, 1048, 1049,
## 1050, 1051, 1052, 5899, ...].
## Parsed with column specification:
## cols(
## `Amino acid` = col_character(),
## `Three letter code` = col_character(),
## `One letter code` = col_character()
## )
snippy_data_modified
## # A tibble: 142,030 x 23
## PAIR_ID REFERENCE ISOLATE CHROM POS TYPE REF ALT EVIDENCE FTYPE STRAND
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 GP-0002 BPH2705.… BPH2704 BPH2… 32669 snp T A A:23 T:0 <NA> <NA>
## 2 GP-0002 BPH2705.… BPH2704 BPH2… 14 snp G T T:29 G:0 <NA> <NA>
## 3 GP-0002 BPH2705.… BPH2704 BPH2… 521 snp C A A:23 C:0 <NA> <NA>
## 4 GP-0002 BPH2705.… BPH2704 BPH2… 389 snp T A A:18 T:0 <NA> <NA>
## 5 GP-0002 BPH2705.… BPH2704 BPH2… 61262 snp A C C:25 A:0 CDS +
## 6 GP-0002 BPH2705.… BPH2704 BPH2… 59956 snp C A A:21 C:0 CDS +
## 7 GP-0002 BPH2705.… BPH2704 BPH2… 38034 snp C A A:41 C:0 CDS +
## 8 GP-0002 BPH2705.… BPH2704 BPH2… 37962 snp A C C:38 A:0 CDS +
## 9 GP-0002 BPH2705.… BPH2704 BPH2… 2728 snp A T T:18 A:0 <NA> <NA>
## 10 GP-0002 BPH2705.… BPH2704 BPH2… 2695 snp C A A:20 C:0 <NA> <NA>
## # … with 142,020 more rows, and 12 more variables: NT_POS <chr>, AA_POS <dbl>,
## # AA_LENGTH <dbl>, EFFTYPE <chr>, NUCLEOTIDE_CHANGE <chr>, MUTATION <chr>,
## # LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>, iso1 <chr>, iso2 <chr>,
## # MUTATION_SHORT <chr>
snippy_data_modified %>%
.$ISOLATE %>%
n_distinct()
## [1] 253
df_n_mutations <- snippy_data_modified %>%
count(PAIR_ID, sort = T)
df_n_mutations %>%
ggplot(aes(x = n)) +
# geom_histogram() +
geom_density() +
theme_bw()
distant_pairs <- df_n_mutations %>%
filter(n > 100) %>%
.$PAIR_ID
snippy_data_modified_no_distant_pairs <- snippy_data_modified %>%
filter(!PAIR_ID %in% distant_pairs)
rm(distant_pairs, df_n_mutations)
protein_clusters_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/mutated_proteins.cd-hit.tab") %>%
rename_all(funs(str_c(., "_prot")))
## Parsed with column specification:
## cols(
## id = col_character(),
## clstr = col_double(),
## clstr_size = col_double(),
## length = col_double(),
## clstr_rep = col_double(),
## clstr_iden = col_character(),
## clstr_cov = col_character()
## )
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
nebraska_clusters <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/representative_proteins_FPR3757_cd-hit.tab") %>%
mutate(source = if_else(str_detect(id, "SAUSA300"), "FPR3757", "mutated_proteins")) %>%
group_by(clstr) %>%
filter(any(str_detect(id, "BPH"))) %>%
group_by(clstr, source) %>%
arrange(desc(clstr_cov, clstr_id)) %>%
filter(!(source == "FPR3757" & row_number() > 1)) %>%
mutate(source_long = str_c(source, "_", row_number())) %>%
ungroup() %>%
select(id, clstr, source_long) %>%
pivot_wider(names_from = source_long, values_from = id) %>%
pivot_longer(cols = starts_with("mutated_proteins"), names_to = "source_long", values_to = "id_prot") %>%
drop_na(id_prot) %>%
select(id_prot, nebraska_locus_tag = FPR3757_1)
## Parsed with column specification:
## cols(
## id = col_character(),
## clstr = col_double(),
## clstr_size = col_double(),
## length = col_double(),
## clstr_rep = col_double(),
## clstr_iden = col_character(),
## clstr_cov = col_character()
## )
df_neb <- read_csv("Ideas_Grant_2020_analysis/Raw_data/nebraska_all_proteins.csv") %>%
rename(neb_mutant_id = `Strain Name`, neb_gene = `Gene name`, neb_product = `gene discription`)
## Parsed with column specification:
## cols(
## `Strain Name` = col_character(),
## plate384 = col_double(),
## Well384 = col_character(),
## `Gene name` = col_character(),
## `gene discription` = col_character(),
## nebraska_locus_tag = col_character(),
## Row384 = col_character(),
## Column384 = col_double(),
## nebraska_aa_seq = col_character()
## )
protein_clusters_data <- protein_clusters_data %>%
left_join(nebraska_clusters) %>%
group_by(clstr_prot) %>%
mutate(nebraska_locus_tag = nebraska_locus_tag[which(clstr_rep_prot == 1)]) %>%
left_join(df_neb)
## Joining, by = "id_prot"
## Joining, by = "nebraska_locus_tag"
protein_seq_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/mutated_proteins.tab") %>%
rename_all(funs(str_c(., "_prot")))
## Parsed with column specification:
## cols(
## FASTA_ID = col_character(),
## SEQUENCE = col_character()
## )
snippy_data_modified_proteins <- snippy_data_modified_no_distant_pairs %>%
left_join(protein_clusters_data,
by = c("LOCUS_TAG" = "id_prot")) %>%
left_join(protein_seq_data,
by = c("LOCUS_TAG" = "FASTA_ID_prot"))
snippy_data_modified_proteins
## # A tibble: 104,035 x 39
## PAIR_ID REFERENCE ISOLATE CHROM POS TYPE REF ALT EVIDENCE FTYPE STRAND
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 GP-0002 BPH2705.… BPH2704 BPH2… 32669 snp T A A:23 T:0 <NA> <NA>
## 2 GP-0002 BPH2705.… BPH2704 BPH2… 14 snp G T T:29 G:0 <NA> <NA>
## 3 GP-0002 BPH2705.… BPH2704 BPH2… 521 snp C A A:23 C:0 <NA> <NA>
## 4 GP-0002 BPH2705.… BPH2704 BPH2… 389 snp T A A:18 T:0 <NA> <NA>
## 5 GP-0002 BPH2705.… BPH2704 BPH2… 61262 snp A C C:25 A:0 CDS +
## 6 GP-0002 BPH2705.… BPH2704 BPH2… 59956 snp C A A:21 C:0 CDS +
## 7 GP-0002 BPH2705.… BPH2704 BPH2… 38034 snp C A A:41 C:0 CDS +
## 8 GP-0002 BPH2705.… BPH2704 BPH2… 37962 snp A C C:38 A:0 CDS +
## 9 GP-0002 BPH2705.… BPH2704 BPH2… 2728 snp A T T:18 A:0 <NA> <NA>
## 10 GP-0002 BPH2705.… BPH2704 BPH2… 2695 snp C A A:20 C:0 <NA> <NA>
## # … with 104,025 more rows, and 28 more variables: NT_POS <chr>, AA_POS <dbl>,
## # AA_LENGTH <dbl>, EFFTYPE <chr>, NUCLEOTIDE_CHANGE <chr>, MUTATION <chr>,
## # LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>, iso1 <chr>, iso2 <chr>,
## # MUTATION_SHORT <chr>, clstr_prot <dbl>, clstr_size_prot <dbl>,
## # length_prot <dbl>, clstr_rep_prot <dbl>, clstr_iden_prot <chr>,
## # clstr_cov_prot <chr>, nebraska_locus_tag <chr>, neb_mutant_id <chr>,
## # plate384 <dbl>, Well384 <chr>, neb_gene <chr>, neb_product <chr>,
## # Row384 <chr>, Column384 <dbl>, nebraska_aa_seq <chr>, SEQUENCE_prot <chr>
col_names <- c("CHROM_intergenic",
"START",
"END",
"CHROM_protein",
"SOURCE",
"TYPE",
"START_flank_prot",
"END_flank_prot",
"SCORE",
"STRAND_flank_prot",
"PHASE",
"ATTRIBUTES_flank_prot",
"CHROM_mutation",
"POS_minus1",
"POS",
"PAIR_ID",
"REFERENCE",
"ISOLATE")
intergenic_regions_annotated <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_mutated.intergenic.annotated.bed",
col_names = col_names) %>%
select(CHROM = CHROM_intergenic,
START,
END,
START_flank_prot,
END_flank_prot,
STRAND_flank_prot,
ATTRIBUTES_flank_prot,
POS,
PAIR_ID,
REFERENCE,
ISOLATE)
## Parsed with column specification:
## cols(
## CHROM_intergenic = col_character(),
## START = col_double(),
## END = col_double(),
## CHROM_protein = col_character(),
## SOURCE = col_character(),
## TYPE = col_character(),
## START_flank_prot = col_double(),
## END_flank_prot = col_double(),
## SCORE = col_character(),
## STRAND_flank_prot = col_character(),
## PHASE = col_double(),
## ATTRIBUTES_flank_prot = col_character(),
## CHROM_mutation = col_character(),
## POS_minus1 = col_double(),
## POS = col_double(),
## PAIR_ID = col_character(),
## REFERENCE = col_character(),
## ISOLATE = col_character()
## )
upstream_proteins <- intergenic_regions_annotated %>%
group_by(CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE) %>%
filter(END_flank_prot < POS) %>%
rename_at(vars(ends_with("_flank_prot")),
funs(str_c(., "_upstream")))
downstream_proteins <- intergenic_regions_annotated %>%
group_by(CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE) %>%
filter(START_flank_prot > POS) %>%
rename_at(vars(ends_with("_flank_prot")),
funs(str_c(., "_downstream")))
intergenic_regions_annotated <- upstream_proteins %>%
left_join(downstream_proteins)
## Joining, by = c("CHROM", "START", "END", "POS", "PAIR_ID", "REFERENCE", "ISOLATE")
intergenic_regions_annotated
## # A tibble: 42,815 x 15
## # Groups: CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE [42,815]
## CHROM START END START_flank_pro… END_flank_prot_… STRAND_flank_pr…
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 BPH2… 213895 214187 212549 213895 +
## 2 BPH2… 162407 162758 162189 162407 -
## 3 BPH2… 162407 162758 162189 162407 -
## 4 BPH2… 162407 162758 162189 162407 -
## 5 BPH2… 174634 175871 174428 174634 +
## 6 BPH2… 174634 175871 174428 174634 +
## 7 BPH2… 178465 178709 178217 178465 +
## 8 BPH2… 181179 181441 180931 181179 +
## 9 BPH2… 231689 232169 230541 231689 +
## 10 BPH2… 98412 98874 97246 98412 -
## # … with 42,805 more rows, and 9 more variables:
## # ATTRIBUTES_flank_prot_upstream <chr>, POS <dbl>, PAIR_ID <chr>,
## # REFERENCE <chr>, ISOLATE <chr>, START_flank_prot_downstream <dbl>,
## # END_flank_prot_downstream <dbl>, STRAND_flank_prot_downstream <chr>,
## # ATTRIBUTES_flank_prot_downstream <chr>
rm(upstream_proteins, downstream_proteins)
intergenic_clusters_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_mutated.intergenic.cd-hit.tab") %>%
separate(id, into = c("CHROM", "START", "END"), sep = "[:-]", remove = F) %>%
rename_at(vars(id, starts_with("clstr")),
funs(str_c(., "_intergenic"))) %>%
mutate_at(vars(START, END), as.numeric) %>%
distinct()
## Parsed with column specification:
## cols(
## id = col_character(),
## clstr = col_double(),
## clstr_size = col_double(),
## length = col_double(),
## clstr_rep = col_double(),
## clstr_iden = col_character(),
## clstr_cov = col_character()
## )
snippy_data_modified_proteins_intergenic_regions <- snippy_data_modified_proteins %>%
left_join(intergenic_regions_annotated,
by = c("CHROM", "POS", "PAIR_ID", "REFERENCE", "ISOLATE")) %>%
left_join(intergenic_clusters_data)
## Joining, by = c("CHROM", "START", "END")
df_phenotypes <- read_csv("Ideas_Grant_2020_analysis/Genetic_pairs_table/genetic_pairs_pheno_changes_mortality_switches.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## iso2 = col_character(),
## iso1 = col_character(),
## iso1_ST = col_character(),
## iso1_strain_group = col_character(),
## iso1_mortality = col_character(),
## iso1_included = col_logical(),
## iso1_scv = col_logical(),
## iso1_persister_type = col_character(),
## iso1_sample_type = col_character(),
## iso1_genes = col_character(),
## iso1_CC = col_character(),
## iso2_ST = col_character(),
## iso2_strain_group = col_character(),
## iso2_mortality = col_character(),
## iso2_included = col_logical(),
## iso2_scv = col_logical(),
## iso2_persister_type = col_character(),
## iso2_sample_type = col_character(),
## iso2_genes = col_character(),
## iso2_CC = col_character()
## # ... with 2 more columns
## )
## See spec(...) for full column specifications.
# check available phenotypes
# mortality
df_phenotypes %>%
select(iso1, iso2, switches) %>%
distinct() %>%
count(switches)
## # A tibble: 5 x 2
## switches n
## <chr> <int>
## 1 Died-Died 42
## 2 Died-Survived 20
## 3 Survived-Died 20
## 4 Survived-Survived 146
## 5 <NA> 6
# most mortality phenotypes are not available. Need to recalculate that
df_all_genetic_pairs_pheno <- read_csv("Ideas_Grant_2020_analysis/Genetic_pairs_table/df_all_genetic_pairs_pheno.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## iso2 = col_character(),
## iso1 = col_character(),
## iso1_ST = col_character(),
## iso1_strain_group = col_character(),
## iso1_mortality = col_character(),
## iso1_included = col_logical(),
## iso1_scv = col_logical(),
## iso1_persister_type = col_character(),
## iso1_sample_type = col_character(),
## iso1_genes = col_character(),
## iso1_CC = col_character(),
## iso2_ST = col_character(),
## iso2_strain_group = col_character(),
## iso2_mortality = col_character(),
## iso2_included = col_logical(),
## iso2_scv = col_logical(),
## iso2_persister_type = col_character(),
## iso2_sample_type = col_character(),
## iso2_genes = col_character(),
## iso2_CC = col_character()
## # ... with 1 more columns
## )
## See spec(...) for full column specifications.
df_all_genetic_pairs_pheno %>%
select(iso1, iso2, iso1_mortality, iso2_mortality) %>%
filter(is.na(iso1_mortality) | is.na(iso2_mortality))
## # A tibble: 90 x 4
## iso1 iso2 iso1_mortality iso2_mortality
## <chr> <chr> <chr> <chr>
## 1 BPH2751 BPH2750 Died <NA>
## 2 BPH2750 BPH2751 <NA> Died
## 3 BPH2888 BPH2887 Died <NA>
## 4 BPH2887 BPH2888 <NA> Died
## 5 BPH2898 BPH2897 Died <NA>
## 6 BPH2897 BPH2898 <NA> Died
## 7 BPH2966 BPH2965 Died <NA>
## 8 BPH2965 BPH2966 <NA> Died
## 9 BPH2988 BPH2987 Died <NA>
## 10 BPH2987 BPH2988 <NA> Died
## # … with 80 more rows
df_all_genetic_pairs_pheno_switches <- df_all_genetic_pairs_pheno %>%
mutate(switches = str_c(iso1_mortality, "-", iso2_mortality))
# count
df_all_genetic_pairs_pheno_switches %>%
select(iso1, iso2, switches) %>%
distinct() %>%
count(switches)
## # A tibble: 5 x 2
## switches n
## <chr> <int>
## 1 Died-Died 68
## 2 Died-Survived 316
## 3 Survived-Died 316
## 4 Survived-Survived 1694
## 5 <NA> 90
# df_mutations_phenotypes <- snippy_data_modified_proteins_intergenic_regions %>%
# left_join(df_phenotypes)
No intergenic for now until we have fixed the merging issues above
df_mutations_phenotypes_mortality <- snippy_data_modified_proteins %>%
left_join(df_all_genetic_pairs_pheno_switches)
## Joining, by = c("iso1", "iso2")
# convergence_all <- df_mutations_phenotypes_mortality %>%
# group_by(clstr_prot) %>%
# summarise(GENE = str_c(unique(GENE), collapse = "|"),
# n = n_distinct(PAIR_ID),
# switches = str_c(switches, collapse = "|"),
# mutations = str_c(unique(MUTATION_SHORT), collapse = "|"))
# mortality_convergence <- df_mutations_phenotypes_mortality %>%
# group_by(switches, clstr_prot) %>%
# summarise(n = n_distinct(PAIR_ID),
# GENE = str_c(unique(GENE), collapse = "|"),
# pairs = str_c(str_c(iso1, "-", iso2), collapse = "|"),
# mutations = str_c(unique(MUTATION_SHORT), collapse = "|"))
# df_phenotypes_no_dup <- df_phenotypes %>%
# rowwise() %>%
# mutate(key = paste(sort(c(iso1, iso2, switches)), collapse = "")) %>%
# #select(iso1, iso2, switches, key)
# distinct(key, .keep_all = T) %>%
# ungroup()
#
# df_mutations_phenotypes_no_dup <- snippy_data_modified_proteins %>%
# right_join(df_phenotypes_no_dup)
#
# convergence_all <- df_mutations_phenotypes_no_dup %>%
# drop_na(clstr_prot) %>%
# group_by(clstr_prot, clstr_size_prot) %>%
# summarise(gene = str_c(unique(GENE), collapse = "|"),
# n = n_distinct(PAIR_ID),
# switches = str_c(switches, collapse = "|"))
#
# convergence_mortality <- df_mutations_phenotypes_no_dup %>%
# drop_na(clstr_prot) %>%
# filter(switches %in% c("Survived-Died", "Died-Survived")) %>%
# group_by(clstr_prot, clstr_size_prot) %>%
# summarise(gene = str_c(unique(GENE), collapse = "|"),
# product = str_c(unique(PRODUCT), collapse = "|"),
# mutations = str_c(unique(MUTATION_SHORT), collapse = "|"),
# n = n_distinct(PAIR_ID), n_references = n_distinct(REFERENCE))
#
# convergence_mortality_control <- df_mutations_phenotypes_no_dup %>%
# drop_na(clstr_prot) %>%
# filter(switches %in% c("Survived-Survived", "Died-Died")) %>%
# group_by(clstr_prot, clstr_size_prot) %>%
# summarise(gene = str_c(unique(GENE), collapse = "|"),
# product = str_c(unique(PRODUCT), collapse = "|"),
# mutations = str_c(unique(MUTATION_SHORT), collapse = "|"),
# n = n_distinct(PAIR_ID), n_references = n_distinct(REFERENCE))
We need a non-symmetric dataset
require(harrietr)
snp.dist.mat <- read.csv("Data_analysis/Genetic_pairs_analysis/VANANZ_core.aln.dist.mat", sep = "\t") %>%
as.matrix(.)
str(snp.dist.mat)
## chr [1:844, 1:845] "BPH2700" "BPH2701" "BPH2702" "BPH2703" "BPH2704" ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:845] "snp.dists.0.6.3" "BPH2700" "BPH2701" "BPH2702" ...
# put 1st column as row name and remove 1st column
row.names(snp.dist.mat) <- snp.dist.mat[,1]
snp.dist.mat <- snp.dist.mat[,2:845]
str(snp.dist.mat)
## chr [1:844, 1:844] " 0" "18224" " 9375" " 9267" "25534" "25539" "29327" ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:844] "BPH2700" "BPH2701" "BPH2702" "BPH2703" ...
## ..$ : chr [1:844] "BPH2700" "BPH2701" "BPH2702" "BPH2703" ...
snp.dist.df <- harrietr::melt_dist(snp.dist.mat) %>%
mutate(dist = as.integer(dist)) %>%
filter(iso1 != "Reference") # remove reference
## Warning: attributes are not identical across measure variables;
## they will be dropped
str(snp.dist.df)
## 'data.frame': 355501 obs. of 3 variables:
## $ iso1: chr "BPH2701" "BPH2702" "BPH2703" "BPH2704" ...
## $ iso2: chr "BPH2700" "BPH2700" "BPH2700" "BPH2700" ...
## $ dist: int 18224 9375 9267 25534 25539 29327 9232 9190 9210 11106 ...
close_strain.df <- snp.dist.df %>%
filter(dist <= 30)
nrow(close_strain.df)
## [1] 1242
# check that there are no duplicate pairs
df_close_pairs <- close_strain.df %>%
distinct(iso1, iso2)
nrow(df_close_pairs)
## [1] 1242
# df_mutations_phenotypes_mortality_non_symmetric <- df_mutations_phenotypes_mortality %>%
# right_join(close_strain.df)
#
# df_convergence_mortality_non_symmetric <- df_mutations_phenotypes_mortality %>%
# drop_na(clstr_prot) %>%
# filter(switches %in% c("Survived-Died", "Died-Survived")) %>%
# group_by(clstr_prot, clstr_size_prot) %>%
# summarise(gene = str_c(unique(GENE), collapse = "|"),
# product = str_c(unique(PRODUCT), collapse = "|"),
# mutations = str_c(unique(MUTATION_SHORT), collapse = "|"),
# n = n_distinct(PAIR_ID), n_references = n_distinct(REFERENCE))
#
# df_convergence_mortality_non_symmetric_long <- df_mutations_phenotypes_mortality %>%
# drop_na(clstr_prot) %>%
# filter(switches %in% c("Survived-Died", "Died-Survived")) %>%
# filter(EFFTYPE != "synonymous_variant") %>%
# group_by(clstr_prot, clstr_size_prot) %>%
# mutate(n = n_distinct(PAIR_ID),
# n_references = n_distinct(REFERENCE)) %>%
# select(n_references, n, everything()) %>%
# filter(n_references > 1)
df_mortality_close_pairs_indep <- df_all_genetic_pairs_pheno_switches %>%
right_join(df_close_pairs) %>%
filter(switches %in% c("Survived-Died", "Died-Survived")) %>%
arrange(iso1) %>%
mutate(same_iso1 = if_else(row_number() == 1, FALSE, lag(iso1) == iso1)) %>%
filter(!same_iso1) %>%
arrange(iso2) %>%
mutate(same_iso2 = if_else(row_number() == 1, FALSE, lag(iso2) == iso2)) %>%
filter(!same_iso2) %>%
select(iso1, same_iso1, iso2, same_iso2, everything())
## Joining, by = c("iso2", "iso1")
df_mutations_phenotypes_mortality_indep <- snippy_data_modified_proteins %>%
right_join(df_mortality_close_pairs_indep)
## Joining, by = c("iso1", "iso2")
df_convergence_mortality_indep <- df_mutations_phenotypes_mortality_indep %>%
drop_na(PRODUCT) %>%
filter(EFFTYPE != "synonymous_variant") %>%
group_by(clstr_prot, clstr_size_prot) %>%
mutate(n = n_distinct(PAIR_ID),
n_references = n_distinct(REFERENCE)) %>%
select(n_references, n, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, everything()) %>%
arrange(clstr_prot, desc(n_references))
df_convergence_mortality_indep
## # A tibble: 524 x 116
## # Groups: clstr_prot, clstr_size_prot [403]
## n_references n clstr_prot clstr_size_prot GENE PRODUCT MUTATION_SHORT
## <int> <int> <dbl> <dbl> <chr> <chr> <chr>
## 1 3 3 0 61 ebh Extrac… A2602E
## 2 3 3 0 61 ebh Extrac… F139S
## 3 3 3 0 61 ebh Extrac… Q7253fs
## 4 1 1 5 23 <NA> hypoth… F1903L
## 5 3 3 7 73 tycC Tyroci… L1024fs
## 6 3 3 7 73 tycC Tyroci… E1333K
## 7 3 3 7 73 tycC Tyroci… K2308fs
## 8 1 1 11 6 <NA> hypoth… S1502N
## 9 1 1 13 54 <NA> hypoth… G1212S
## 10 1 1 15 24 essC Type V… G1038D
## # … with 514 more rows, and 109 more variables: PAIR_ID <chr>, REFERENCE <chr>,
## # ISOLATE <chr>, CHROM <chr>, POS <dbl>, TYPE <chr>, REF <chr>, ALT <chr>,
## # EVIDENCE <chr>, FTYPE <chr>, STRAND <chr>, NT_POS <chr>, AA_POS <dbl>,
## # AA_LENGTH <dbl>, EFFTYPE <chr>, NUCLEOTIDE_CHANGE <chr>, MUTATION <chr>,
## # LOCUS_TAG <chr>, iso1 <chr>, iso2 <chr>, length_prot <dbl>,
## # clstr_rep_prot <dbl>, clstr_iden_prot <chr>, clstr_cov_prot <chr>,
## # nebraska_locus_tag <chr>, neb_mutant_id <chr>, plate384 <dbl>,
## # Well384 <chr>, neb_gene <chr>, neb_product <chr>, Row384 <chr>,
## # Column384 <dbl>, nebraska_aa_seq <chr>, SEQUENCE_prot <chr>,
## # same_iso1 <lgl>, same_iso2 <lgl>, dist <dbl>, iso1_ST <chr>,
## # iso1_strain_group <chr>, iso1_mecA <dbl>, iso1_mortality <chr>,
## # iso1_intrahost_index <dbl>, iso1_included <lgl>,
## # iso1_unrelated_to_index <dbl>, iso1_mut_count <dbl>, iso1_scv <lgl>,
## # iso1_vanco_difference <dbl>, iso1_persister_type <chr>,
## # iso1_intrahost_sampledelay <dbl>, iso1_sample_type <chr>, iso1_genes <chr>,
## # iso1_CC <chr>, iso2_ST <chr>, iso2_strain_group <chr>, iso2_mecA <dbl>,
## # iso2_mortality <chr>, iso2_intrahost_index <dbl>, iso2_included <lgl>,
## # iso2_unrelated_to_index <dbl>, iso2_mut_count <dbl>, iso2_scv <lgl>,
## # iso2_vanco_difference <dbl>, iso2_persister_type <chr>,
## # iso2_intrahost_sampledelay <dbl>, iso2_sample_type <chr>, iso2_genes <chr>,
## # iso2_CC <chr>, CC <chr>, iso1_time_of_max_rate_OD <dbl>,
## # iso1_max_rate_OD <dbl>, iso1_doubling_time_OD <dbl>, iso1_AUC_OD <dbl>,
## # iso1_time_of_max_OD <dbl>, iso1_max_OD <dbl>, iso1_time_of_min_OD <dbl>,
## # iso1_min_OD <dbl>, iso1_end_point_OD <dbl>,
## # iso1_end_point_timepoint_OD <dbl>, iso1_AUC_death <dbl>,
## # iso1_time_of_max_death <dbl>, iso1_max_death <dbl>,
## # iso1_time_of_min_death <dbl>, iso1_min_death <dbl>,
## # iso1_time_of_max_rate_death <dbl>, iso1_max_rate_death <dbl>,
## # iso1_doubling_time_death <dbl>, iso1_end_point_death <dbl>,
## # iso1_end_point_timepoint_death <dbl>, iso2_time_of_max_rate_OD <dbl>,
## # iso2_max_rate_OD <dbl>, iso2_doubling_time_OD <dbl>, iso2_AUC_OD <dbl>,
## # iso2_time_of_max_OD <dbl>, iso2_max_OD <dbl>, iso2_time_of_min_OD <dbl>,
## # iso2_min_OD <dbl>, iso2_end_point_OD <dbl>,
## # iso2_end_point_timepoint_OD <dbl>, iso2_AUC_death <dbl>,
## # iso2_time_of_max_death <dbl>, …
n_distinct(df_convergence_mortality_indep$PAIR_ID)
## [1] 22
t_mortality_indep <- df_convergence_mortality_indep %>%
distinct(n_references, n, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
group_by(n_references, clstr_prot, clstr_size_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
funs(str_c(unique(.), collapse = "|")))
df_mortality_close_pairs_indep_controls <- df_all_genetic_pairs_pheno_switches %>%
right_join(df_close_pairs) %>%
drop_na(switches) %>%
filter(!switches %in% c("Survived-Died", "Died-Survived")) %>%
arrange(iso1) %>%
mutate(same_iso1 = if_else(row_number() == 1, FALSE, lag(iso1) == iso1)) %>%
filter(!same_iso1) %>%
arrange(iso2) %>%
mutate(same_iso2 = if_else(row_number() == 1, FALSE, lag(iso2) == iso2)) %>%
filter(!same_iso2) %>%
select(iso1, same_iso1, iso2, same_iso2, everything())
## Joining, by = c("iso2", "iso1")
df_mutations_phenotypes_mortality_indep_controls <- snippy_data_modified_proteins %>%
right_join(df_mortality_close_pairs_indep)
## Joining, by = c("iso1", "iso2")
n_distinct(df_mutations_phenotypes_mortality_indep_controls$PAIR_ID)
## [1] 23
df_convergence_mortality_controls <- df_mutations_phenotypes_mortality_indep %>%
drop_na(PRODUCT) %>%
filter(EFFTYPE != "synonymous_variant") %>%
group_by(clstr_prot, clstr_size_prot) %>%
mutate(n = n_distinct(PAIR_ID),
n_references = n_distinct(REFERENCE)) %>%
select(n_references, n, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, everything()) %>%
arrange(clstr_prot, desc(n_references))
t_controls <- df_convergence_mortality_controls %>%
distinct(n_references, n, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT) %>%
group_by(n_references, clstr_prot, clstr_size_prot) %>%
summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
funs(str_c(unique(.), collapse = "|")))
unique mutations for each lethal strain
tree with mutations
as many switches as lethal strains
df_mortality_close_clusters <- df_all_genetic_pairs_pheno_switches %>%
arrange(iso2) %>%
filter(switches %in% c("Survived-Died")) %>%
group_by(iso2) %>%
mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
# rowwise() %>%
# mutate(key = str_c(sort(c(iso1, iso2)), collapse = "")) %>%
# select(iso1, iso2, switches, iso_cluster, key) %>%
# distinct(key, .keep_all = T) %>%
select(iso1, iso2, switches, iso_cluster)
df_mortality_close_clusters %>%
.$iso_cluster %>%
n_distinct()
## [1] 28
df_mortality_close_clusters %>%
ggplot(aes(x = fct_infreq(iso_cluster))) +
geom_bar() +
coord_flip() +
labs(x = "") +
theme_bw()
df_mutations_phenotypes_mortality_close_clusters <- snippy_data_modified_proteins %>%
right_join(df_mortality_close_clusters)
## Joining, by = c("iso1", "iso2")
df_convergence_mortality_clusters <- df_mutations_phenotypes_mortality_close_clusters %>%
drop_na(PRODUCT) %>%
filter(EFFTYPE != "synonymous_variant") %>%
group_by(clstr_prot, clstr_size_prot) %>%
mutate(n_clusters = n_distinct(iso_cluster),
n_pairs = n_distinct(PAIR_ID),
n_references = n_distinct(REFERENCE)) %>%
select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, switches, everything()) %>%
arrange(clstr_prot, desc(n_clusters))
t_mortality_clusters <- df_convergence_mortality_clusters %>%
group_by(clstr_prot) %>%
mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
funs(str_c(unique(.), collapse = "|")))%>%
mutate(phenotype = "SAB mortality") %>%
arrange(desc(n_clusters))
t_mortality_clusters
## # A tibble: 689 x 13
## # Groups: n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot,
## # nebraska_locus_tag, neb_mutant_id, neb_gene [689]
## n_clusters n_pairs clstr_prot clstr_size_prot SEQUENCE_prot nebraska_locus_…
## <int> <int> <dbl> <dbl> <chr> <chr>
## 1 13 25 7 73 MIMGNLRFQQEY… SAUSA300_0181
## 2 13 31 358 30 MLFHKKIESLIS… SAUSA300_2167
## 3 11 11 469 10 MKFSTLSEEEFT… SAUSA300_1141
## 4 11 13 1105 49 MSNQNYDYNKNE… SAUSA300_0754
## 5 11 17 569 20 MKIIHTADWHLG… SAUSA300_1242
## 6 11 22 211 25 MAKLLIMSIVSF… <NA>
## 7 11 54 140 56 MKFKSLITTTLA… <NA>
## 8 11 55 0 61 MNYRDKIQKFSI… SAUSA300_1327
## 9 10 17 65 27 MNMKKKEKHAIR… <NA>
## 10 10 26 435 66 MELLNRYNFVLF… SAUSA300_1991
## # … with 679 more rows, and 7 more variables: neb_mutant_id <chr>,
## # neb_gene <chr>, neb_product <chr>, GENE <chr>, PRODUCT <chr>,
## # MUTATION_SHORT <chr>, phenotype <chr>
df_phenotypes
## # A tibble: 234 x 111
## iso2 iso1 dist iso1_ST iso1_strain_gro… iso1_mecA iso1_mortality
## <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 BPH2… BPH2… 21 93 VAN-004 1 Survived
## 2 BPH2… BPH2… 21 93 VAN-004 1 Survived
## 3 BPH2… BPH2… 20 97 VAN-007 0 Survived
## 4 BPH2… BPH2… 20 97 VAN-007 0 Survived
## 5 BPH2… BPH2… 18 239 VAN-064 1 Survived
## 6 BPH2… BPH3… 16 239 VSS-287 1 Died
## 7 BPH2… BPH2… 20 239 VAN-155 1 Died
## 8 BPH2… BPH2… 29 45 VAN-011 0 Survived
## 9 BPH2… BPH2… 29 45 VAN-011 0 Survived
## 10 BPH2… BPH2… 18 239 VAN-082 1 Survived
## # … with 224 more rows, and 104 more variables: iso1_intrahost_index <dbl>,
## # iso1_included <lgl>, iso1_unrelated_to_index <dbl>, iso1_mut_count <dbl>,
## # iso1_scv <lgl>, iso1_vanco_difference <dbl>, iso1_persister_type <chr>,
## # iso1_intrahost_sampledelay <dbl>, iso1_sample_type <chr>, iso1_genes <chr>,
## # iso1_CC <chr>, iso2_ST <chr>, iso2_strain_group <chr>, iso2_mecA <dbl>,
## # iso2_mortality <chr>, iso2_intrahost_index <dbl>, iso2_included <lgl>,
## # iso2_unrelated_to_index <dbl>, iso2_mut_count <dbl>, iso2_scv <lgl>,
## # iso2_vanco_difference <dbl>, iso2_persister_type <chr>,
## # iso2_intrahost_sampledelay <dbl>, iso2_sample_type <chr>, iso2_genes <chr>,
## # iso2_CC <chr>, CC <chr>, iso1_time_of_max_rate_OD <dbl>,
## # iso1_max_rate_OD <dbl>, iso1_doubling_time_OD <dbl>, iso1_AUC_OD <dbl>,
## # iso1_time_of_max_OD <dbl>, iso1_max_OD <dbl>, iso1_time_of_min_OD <dbl>,
## # iso1_min_OD <dbl>, iso1_end_point_OD <dbl>,
## # iso1_end_point_timepoint_OD <dbl>, iso1_AUC_death <dbl>,
## # iso1_time_of_max_death <dbl>, iso1_max_death <dbl>,
## # iso1_time_of_min_death <dbl>, iso1_min_death <dbl>,
## # iso1_time_of_max_rate_death <dbl>, iso1_max_rate_death <dbl>,
## # iso1_doubling_time_death <dbl>, iso1_end_point_death <dbl>,
## # iso1_end_point_timepoint_death <dbl>, iso2_time_of_max_rate_OD <dbl>,
## # iso2_max_rate_OD <dbl>, iso2_doubling_time_OD <dbl>, iso2_AUC_OD <dbl>,
## # iso2_time_of_max_OD <dbl>, iso2_max_OD <dbl>, iso2_time_of_min_OD <dbl>,
## # iso2_min_OD <dbl>, iso2_end_point_OD <dbl>,
## # iso2_end_point_timepoint_OD <dbl>, iso2_AUC_death <dbl>,
## # iso2_time_of_max_death <dbl>, iso2_max_death <dbl>,
## # iso2_time_of_min_death <dbl>, iso2_min_death <dbl>,
## # iso2_time_of_max_rate_death <dbl>, iso2_max_rate_death <dbl>,
## # iso2_doubling_time_death <dbl>, iso2_end_point_death <dbl>,
## # iso2_end_point_timepoint_death <dbl>, delta_time_of_max_rate_OD <dbl>,
## # delta_max_rate_OD <dbl>, delta_doubling_time_OD <dbl>, delta_AUC_OD <dbl>,
## # delta_time_of_max_OD <dbl>, delta_time_of_min_OD <dbl>, delta_max_OD <dbl>,
## # delta_min_OD <dbl>, delta_end_point_OD <dbl>,
## # delta_time_of_max_rate_death <dbl>, delta_max_rate_death <dbl>,
## # delta_doubling_time_death <dbl>, delta_AUC_death <dbl>,
## # delta_time_of_max_death <dbl>, delta_time_of_min_death <dbl>,
## # delta_max_death <dbl>, delta_min_death <dbl>, delta_end_point_death <dbl>,
## # log2fc_time_of_max_rate_OD <dbl>, log2fc_max_rate_OD <dbl>,
## # log2fc_doubling_time_OD <dbl>, log2fc_AUC_OD <dbl>,
## # log2fc_time_of_max_OD <dbl>, log2fc_time_of_min_OD <dbl>,
## # log2fc_max_OD <dbl>, log2fc_min_OD <dbl>, log2fc_end_point_OD <dbl>,
## # log2fc_time_of_max_rate_death <dbl>, log2fc_max_rate_death <dbl>,
## # log2fc_doubling_time_death <dbl>, log2fc_AUC_death <dbl>,
## # log2fc_time_of_max_death <dbl>, log2fc_time_of_min_death <dbl>, …
for (var in str_subset(colnames(df_phenotypes), "iso1_.*_OD")){
p <- ggdensity(data = df_phenotypes, x = var, fill = "red") +
theme_bw()
print(p)
}
df_phenotypes %>%
ggplot(aes(x = abs(delta_AUC_OD))) +
geom_density(fill = "red") +
theme_bw()
df_plot <- df_phenotypes %>%
mutate_at(vars(matches("delta_.*_OD")),
abs)
for (var in str_subset(colnames(df_plot), "delta_.*_OD")){
p <- ggdensity(data = df_plot, x = var, fill = "red") +
theme_bw()
print(p)
}
Based on the exploration above, we define a delta AUC threshold of 30 for discordant pairs (60 would be a more stringent one)
df_growth_close_clusters <- df_phenotypes %>%
arrange(iso2) %>%
filter(delta_AUC_OD < - 10) %>%
group_by(iso2) %>%
mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
ungroup() %>%
distinct(iso1, .keep_all =T) %>%
select(iso1, iso2, iso_cluster, delta_AUC_OD, iso1_AUC_OD, iso2_AUC_OD)
df_growth_close_clusters %>%
.$iso_cluster %>%
n_distinct()
## [1] 19
df_growth_close_clusters %>%
ggplot(aes(x = fct_infreq(iso_cluster))) +
geom_bar() +
coord_flip() +
labs(x = "") +
theme_bw()
df_mutations_growth_close_clusters <- snippy_data_modified_proteins %>%
right_join(df_growth_close_clusters)
## Joining, by = c("iso1", "iso2")
nrow(df_mutations_growth_close_clusters)
## [1] 1149
df_convergence_growth_clusters <- df_mutations_growth_close_clusters %>%
drop_na(PRODUCT) %>%
filter(EFFTYPE != "synonymous_variant") %>%
group_by(clstr_prot, clstr_size_prot) %>%
mutate(n_clusters = n_distinct(iso_cluster),
n_pairs = n_distinct(PAIR_ID),
n_references = n_distinct(REFERENCE)) %>%
select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, delta_AUC_OD, everything()) %>%
arrange(clstr_prot, desc(n_clusters))
t_growth_clusters <- df_convergence_growth_clusters %>%
group_by(clstr_prot) %>%
mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
funs(str_c(unique(.), collapse = "|"))) %>%
mutate(phenotype = "Growth rate (AUC)") %>%
arrange(desc(n_clusters))
for (var in str_subset(colnames(df_phenotypes), "iso1_.*_death")){
p <- ggdensity(data = df_phenotypes, x = var, fill = "blue") +
theme_bw()
print(p)
}
df_plot <- df_phenotypes %>%
mutate_at(vars(matches("delta_.*_death")),
abs)
for (var in str_subset(colnames(df_plot), "delta_.*_death")){
p <- ggdensity(data = df_plot, x = var, fill = "blue") +
theme_bw()
print(p)
}
df_cell_death_close_clusters <- df_phenotypes %>%
arrange(iso2) %>%
filter(delta_AUC_death < - 40) %>%
group_by(iso2) %>%
mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
ungroup() %>%
distinct(iso1, .keep_all =T) %>%
select(iso1, iso2, iso_cluster, delta_AUC_death, iso1_AUC_death, iso2_AUC_death)
df_cell_death_close_clusters %>%
.$iso_cluster %>%
n_distinct()
## [1] 12
df_cell_death_close_clusters %>%
ggplot(aes(x = fct_infreq(iso_cluster))) +
geom_bar() +
coord_flip() +
labs(x = "") +
theme_bw()
df_mutations_cell_death_close_clusters <- snippy_data_modified_proteins %>%
right_join(df_cell_death_close_clusters)
## Joining, by = c("iso1", "iso2")
nrow(df_mutations_cell_death_close_clusters)
## [1] 838
df_convergence_cell_death_clusters <- df_mutations_cell_death_close_clusters %>%
drop_na(PRODUCT) %>%
filter(EFFTYPE != "synonymous_variant") %>%
group_by(clstr_prot, clstr_size_prot) %>%
mutate(n_clusters = n_distinct(iso_cluster),
n_pairs = n_distinct(PAIR_ID),
n_references = n_distinct(REFERENCE)) %>%
select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, delta_AUC_death, everything()) %>%
arrange(clstr_prot, desc(n_clusters))
t_cell_death_clusters <- df_convergence_cell_death_clusters %>%
group_by(clstr_prot) %>%
mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
funs(str_c(unique(.), collapse = "|"))) %>%
mutate(phenotype = "Cell death (AUC)")
t <- bind_rows(t_mortality_clusters,
t_growth_clusters,
t_cell_death_clusters)
t_convergent <- t %>%
ungroup() %>%
select(clstr_prot, starts_with("neb"), n_clusters, phenotype) %>%
pivot_wider(names_from = phenotype, names_prefix = "n_clusters_", values_from = n_clusters, values_fill = list(n_clusters = 0)) %>%
left_join(t) %>%
group_by_at(vars(clstr_prot, starts_with("n_clusters_"),
starts_with("neb"))) %>%
summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
funs(str_c(unique(.), collapse = "|")))
## Joining, by = c("clstr_prot", "nebraska_locus_tag", "neb_mutant_id", "neb_gene", "neb_product")
write_csv(t_convergent,
"Ideas_Grant_2020_analysis/Convergence_analysis_tables/convergence_analysis_clusters_reduntant_controls.csv")
t_venn <- t_convergent %>%
ungroup() %>%
mutate_at(vars(starts_with("n_clusters")),
funs(mutated = as.integer(. > 0))) %>%
unite(col = "intersect", ends_with("mutated"), sep = "") %>%
mutate(intersect = recode(intersect,
`111` = "all",
`100` = "SAB mortality only",
`110` = "SAB mortality and growth rate",
`101` = "SAB mortality and cell death",
`011` = "Growth rate and cell death",
`010` = "Growth rate only",
`011` = "Growth rate and cell death",
`001` = "Cell death only"))
t_venn_synthesis <- t_venn %>%
count(intersect)
t_venn_stringent <- t_convergent %>%
ungroup() %>%
mutate_at(vars(starts_with("n_clusters")),
funs(mutated = as.integer(. > 1))) %>%
unite(col = "intersect", ends_with("mutated"), sep = "") %>%
filter(intersect != "000") %>%
mutate(intersect = recode(intersect,
`111` = "all",
`100` = "SAB mortality only",
`110` = "SAB mortality and growth rate",
`101` = "SAB mortality and cell death",
`011` = "Growth rate and cell death",
`010` = "Growth rate only",
`011` = "Growth rate and cell death",
`001` = "Cell death only"))
t_venn__stringent_synthesis <- t_venn_stringent %>%
count(intersect)
library(ggtree)
## ggtree v2.0.1 For help: https://yulab-smu.github.io/treedata-book/
##
## If you use ggtree in published research, please cite the most appropriate paper(s):
##
## [36m-[39m Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(12):3041-3043. doi: 10.1093/molbev/msy194
## [36m-[39m Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628
##
## Attaching package: 'ggtree'
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:magrittr':
##
## inset
## The following object is masked from 'package:tidyr':
##
## expand
tree <- read.tree("Ideas_Grant_2020_analysis/Raw_data/VANANZ_phenotypes_n843.tree") %>%
phytools::midpoint.root()
ggtree(tree, layout = "circular")
mortality_close_clusters_isolates_iso1 <- df_mortality_close_clusters %>%
separate(switches, into = c("iso1_mortality", "iso2_mortality")) %>%
ungroup() %>%
select(starts_with("iso1")) %>%
rename(ISOLATE = iso1, mortality = iso1_mortality)
mortality_close_clusters_isolates_iso2 <- df_mortality_close_clusters %>%
separate(switches, into = c("iso1_mortality", "iso2_mortality")) %>%
ungroup() %>%
select(starts_with("iso2")) %>%
rename(ISOLATE = iso2, mortality = iso2_mortality)
mortality_close_clusters_isolates <- bind_rows(mortality_close_clusters_isolates_iso1,
mortality_close_clusters_isolates_iso2) %>%
distinct()
ggtree(tree, layout = "circular") %<+% mortality_close_clusters_isolates +
geom_point2(aes(subset = !is.na(mortality), colour = mortality))